********************************************
* Data cleaning
* Variable Generation and Labeling
********************************************

* Currently, there are 11 different datasets

********************************************************************************
*			1. BASELINE SURVEY
********************************************************************************
* 1. open the 2017 baseline survey data; prefix is "bs_"
* Replaces the data in memory with the raw 2017 baseline file.
use "$rawdata_dir/baseline data 2017/2017_edu_baseline_final.dta", clear

* add the prefix and rename variables
* First every variable whose name starts with "q" receives the "bs_" prefix;
* the question numbers are then replaced by descriptive names. In the wildcard
* renames the text matched by "*" is carried over to the new name
* (e.g. bs_q107a becomes bs_parent_statementa).
* Items that are not renamed here (e.g. bs_q201hr, bs_q217min, bs_q901) keep
* their prefixed question number and are referenced that way further down.
rename q* bs_q*
* q1xx: date of birth, parents, ethnicity, home language
rename bs_q103* bs_dob_*
rename bs_q105 bs_fatherlive
rename bs_q106 bs_motherlive
rename bs_q107* bs_parent_statement*
rename bs_q108 bs_ethnicity
rename bs_q109 bs_language
* q2xx: schooling, study materials, goals, study habits, homework
rename bs_q202 bs_repeat
rename bs_q203 bs_absent
rename bs_q204 bs_absent_reason
rename bs_q205 bs_owndesk
rename bs_q206* bs_owntxtbook*
rename bs_q207 bs_notxtbook	
rename bs_q208* bs_ownstationary*
rename bs_q209 bs_edugoals_noobs
rename bs_q210 bs_edugoals_obs
rename bs_q211 bs_edumotive1
rename bs_q212 bs_edumotive2
rename bs_q213 bs_edumotive3
rename bs_q214* bs_subjectexp*
rename bs_q215* bs_subjectperf*
rename bs_q216 bs_studywkdays
rename bs_q218 bs_studywkdays_location
rename bs_q219* bs_studywkdays_company*
rename bs_q220 bs_gethw
rename bs_q221 bs_misshw
rename bs_q222 bs_studywkends
rename bs_q224 bs_studywkends_location
rename bs_q225* bs_studywkends_company*
* q3xx: meals and health
rename bs_q301 bs_meals
rename bs_q302 bs_health
* q4xx: work
rename bs_q401 bs_work
rename bs_q402 bs_worktype
rename bs_q403 bs_workdays
* q5xx-q7xx: the wildcard keeps the trailing digits, which is why later code
* refers to bs_learnmotive01..10, bs_classenviron1..5 and bs_teacherpercep1..8
rename bs_q5* bs_learnmotive*
rename bs_q60* bs_classenviron*
rename bs_q70* bs_teacherpercep*
* q8xx: summer study and work
rename bs_q801 bs_studysummer
rename bs_q802 bs_studysummer_days
rename bs_q804 bs_studysummer_location
rename bs_q805* bs_studysummer_company*
rename bs_q806 bs_worksummer
* test scores do not start with "q", so they receive the prefix explicitly
rename cog_score bs_cog
rename com_score bs_com
rename listen_score_* bs_listen_*

* additional labeling
* -- variable labels that are set or overwritten by hand
lab var bs_workdays      "Average number of days a week respondent worked in last 30 days"
lab var bs_learnmotive04 "respondent has uneasy upset feeling when taking a test"
lab var bs_subjectexpk   "respondent studies hard in social studies"
lab var bs_studywkdays   "number of days on average respondent studies outside school during weekdays"
lab var bs_studywkends   "number of days on average respondent studies outside school during weekends"

* -- rebuild the level_edu value label from scratch
label drop level_edu
label define level_edu ///
	1  "Primary School" ///
	2  "Secondary School" ///
	3  "Technical/Vocational" ///
	4  "College/University" ///
	5  "Graduate School" ///
	99 "Don't know"

* reformat gender as categorical
* The raw variable is parked under a temporary name, re-encoded with
* egen group() (which also builds the value label), and then removed.
rename gender gender_old
egen gender = group(gender_old), label
gen female = gender == 1 if !missing(gender)
* put the two new variables where the raw one used to be
order gender female, after(gender_old)
drop gender_old
lab var gender "gender"
lab var female "Indicator for female"

* Create age variable
* Birth years of 9999 or above (and missing years) yield a missing age.
gen bs_age = cond(bs_dob_year < 9999, 2017 - bs_dob_year, .)
lab var bs_age "Age"
order bs_age, after(bs_dob_year)

* create indicator for co-residence with parents
*   1 = father and mother are both coded 1
*   0 = at least one of them is coded 2 or 3
*   . = every other combination
gen bs_bothparents = cond(bs_fatherlive == 1 & bs_motherlive == 1, 1, ///
	cond(inrange(bs_fatherlive, 2, 3) | inrange(bs_motherlive, 2, 3), 0, .))
lab var bs_bothparents "Indicator that student lives with both parents"
order bs_bothparents, after(bs_motherlive)

* create indicators for the q107 parent-related statements
* In the baseline questionnaire these items are q107a-q107l (they were renamed
* from bs_q107* to bs_parent_statement*), so the labels refer to q107.
foreach i in a b c d e f g h i j k l {
	// 1 if the answer is coded 5, 0 for any other answer, missing if unanswered
	gen bs_parent_`i'_dum = bs_parent_statement`i' == 5 if bs_parent_statement`i' < .
	lab var bs_parent_`i'_dum "answered 'strongly agree' for q107`i'"
}

* Sub-scores: raw row sums over items a-b, c-d and e-l. With the -missing-
* option a row whose items are all missing gets a missing sum instead of 0.
* Each sum is then standardized over the whole sample.
egen bs_parent_ab_score = rowtotal(bs_parent_statementa bs_parent_statementb), missing
egen bs_parent_ab_score_z = std(bs_parent_ab_score)

egen bs_parent_cd_score = rowtotal(bs_parent_statementc bs_parent_statementd), missing
egen bs_parent_cd_score_z = std(bs_parent_cd_score)

egen bs_parent_el_score = rowtotal(bs_parent_statemente bs_parent_statementf bs_parent_statementg bs_parent_statementh bs_parent_statementi bs_parent_statementj bs_parent_statementk bs_parent_statementl), missing
egen bs_parent_el_score_z = std(bs_parent_el_score)

* create indicators for ethnicity and language
* Each indicator is 1/0 for respondents with a non-missing answer and missing
* otherwise.
generate bs_chewa = (bs_ethnicity == 1) if bs_ethnicity < .
label variable bs_chewa "Indicator for Chewa ethnicity"
generate bs_ngoni = (bs_ethnicity == 4) if bs_ethnicity < .
* label spelling corrected ("Nogni" -> "Ngoni")
label variable bs_ngoni "Indicator for Ngoni ethnicity"
order bs_chewa bs_ngoni, after(bs_ethnicity)
generate bs_chewalang = (bs_language == 1) if bs_language < .
label variable bs_chewalang "Indicator for Chewa language used at home"
order bs_chewalang, after(bs_language)

* create variable for time spent traveling to school
* hours and minutes are combined into minutes
gen bs_schtravel = 60*bs_q201hr + bs_q201min
lab var bs_schtravel "Time spent travelling from home to school (minutes)"
order bs_schtravel, after(bs_q201min)

* create indicator for ever repeat grade
gen bs_ever_repeat = inlist(bs_repeat, 2, 3) if !missing(bs_repeat)
lab var bs_ever_repeat "Indicator for ever repeated a grade"
order bs_ever_repeat, after(bs_repeat)

* create indicator for reasons for being absent from school
gen bs_absent_sick = bs_absent_reason == 1 if !missing(bs_absent_reason)
gen bs_absent_work = bs_absent_reason == 2 if !missing(bs_absent_reason)
lab var bs_absent_sick "Indicator for being absent from school due to sickness"
lab var bs_absent_work "Indicator for being absent from school due to work"
order bs_absent_sick bs_absent_work, after(bs_absent_reason)

* create indicators for own desk at school
* answer codes 1, 2, 3 map to never, sometimes, always
local code = 0
foreach freq in never sometimes always {
	local ++code
	gen bs_owndesk_`freq' = bs_owndesk == `code' if !missing(bs_owndesk)
	lab var bs_owndesk_`freq' "Indicator for `freq' having own desk at school"
}
order bs_owndesk_*, after(bs_owndesk)

* create variable for study materials
* row sums over the ownership items; missing only if every item is missing
egen bs_num_txtbk = rowtotal(bs_owntxtbook*), missing
lab var bs_num_txtbk "Total number of textbooks owned"
order bs_num_txtbk, after(bs_owntxtbookf)

egen bs_num_stat = rowtotal(bs_ownstationary*), missing
lab var bs_num_stat "Total number of stationaries owned"
order bs_num_stat, after(bs_ownstationaryc)

* create indicators for studying without textbook
gen bs_notxtbook_borrow = inrange(bs_notxtbook, 3, 5) if !missing(bs_notxtbook)
gen bs_notxtbook_classnotes = bs_notxtbook == 6 if !missing(bs_notxtbook)
lab var bs_notxtbook_borrow "Indicator that student borrow books when he/she does not have textbook"
lab var bs_notxtbook_classnotes "Indicator that student studies class notes when he/she does not have textbook"
order bs_notxtbook_*, after(bs_notxtbook)

* create indicators for education goals
* answers 4 and 5 count as wanting to complete college
foreach scen in noobs obs {
	local phrase = cond("`scen'" == "noobs", "no", "face")
	gen bs_college_`scen' = inrange(bs_edugoals_`scen', 4, 5) if !missing(bs_edugoals_`scen')
	lab var bs_college_`scen' "Indicator that student wants to complete college if `phrase' obstacles"
	order bs_college_`scen', after(bs_edugoals_`scen')
}

* create indicators for education motives
* dummy = 1 when the item is answered 5; the score is the row sum of the items
forvalues k = 1/3 {
	local item bs_edumotive`k'
	gen `item'_dum = `item' == 5 if !missing(`item')
	lab var `item'_dum "responded 'strongly agree' to `item'"
}
egen bs_edumotive_score = rowtotal(bs_edumotive1 bs_edumotive2 bs_edumotive3), missing
egen bs_edumotive_score_z = std(bs_edumotive_score)

* create indicators for experience on each subject
foreach s in a b c d e f g h i j k l {
	local item bs_subjectexp`s'
	gen `item'_dum = `item' == 5 if !missing(`item')
	lab var `item'_dum "responded 'totally true' to `item'"
}
* the two sub-scores use variable ranges, i.e. they rely on items a-f and g-l
* being stored next to each other in the dataset
egen bs_subjectexp_af = rowtotal(bs_subjectexpa-bs_subjectexpf), missing
egen bs_subjectexp_gl = rowtotal(bs_subjectexpg-bs_subjectexpl), missing
foreach part in af gl {
	egen bs_subjectexp_`part'_z = std(bs_subjectexp_`part')
}

* create indicators for school performance
foreach s in a b c d e f g {
	local item bs_subjectperf`s'
	gen `item'_dum = inrange(`item', 4, 5) if !missing(`item')
	lab var `item'_dum "self-reported performance for `item' is good or very good"
}

* study time (minutes) and studying-at-home indicator, weekdays then weekends
* each spec is: name stub, question number of the hr/min pair, wording for labels
foreach spec in "wkdays 217 weekdays" "wkends 223 weekends" {
	local stub : word 1 of `spec'
	local qnum : word 2 of `spec'
	local when : word 3 of `spec'

	gen bs_study`stub'_time = 60*bs_q`qnum'hr + bs_q`qnum'min
	lab var bs_study`stub'_time "Time spent studying outside school on `when' (minutes)"
	order bs_study`stub'_time, after(bs_q`qnum'min)

	gen bs_study`stub'_home = bs_study`stub'_location == 1 if !missing(bs_study`stub'_location)
	lab var bs_study`stub'_home "Indicator that student usually studies at home on `when'"
	order bs_study`stub'_home, after(bs_study`stub'_location)
}

* create variables for health and nutrition
* meals are top-coded at 3 in the categorical copy
clonevar bs_meals_cat = bs_meals
replace bs_meals_cat = 3 if inrange(bs_meals_cat, 3, .)
label define meals_cat ///
	0 "No meals" ///
	1 "1 meal" ///
	2 "2 meals" ///
	3 "3 or more meals"
label values bs_meals_cat meals_cat
lab var bs_meals "Number of meals yesterday"
order bs_meals_cat, after(bs_meals)

gen bs_modhealth = bs_health <= 3 if !missing(bs_health)
lab var bs_modhealth "Indicator that student has moderate self-reported health or better"
order bs_modhealth, after(bs_health)

* Create indicator for housework
gen bs_work_home = bs_worktype == 0 if !missing(bs_worktype)
lab var bs_work_home "Indicator that student's main work is housework"
order bs_work_home, after(bs_worktype)

* create variable for time spent working in a day
gen bs_worktime = 60*bs_q404hr + bs_q404min
lab var bs_worktime "Time spent working on a working day (minutes)"
order bs_worktime, after(bs_q404min)

* create variable for time spent studying during summer
gen bs_studysummer_time = 60*bs_q803hr + bs_q803min
lab var bs_studysummer_time "Time spent studying a day during the summer (minutes)"
order bs_studysummer_time, after(bs_q803min)

* create indicator for studying at home during summer
gen bs_studysummer_home = bs_studysummer_location == 1 if !missing(bs_studysummer_location)
lab var bs_studysummer_home "Indicator that student usually studies at home during summer"
order bs_studysummer_home, after(bs_studysummer_location)

* create indicators for homework
generate bs_gethw_dum = (bs_gethw == 4) if !missing(bs_gethw)
label variable bs_gethw_dum "Teachers gave homework more than 4 times/month"

generate bs_misshw_dum = (bs_misshw == 1) if !missing(bs_misshw)
label variable bs_misshw_dum "Never missed homework in a month"

* health and nutrition
* dummy for the top category of the top-coded meals variable
generate bs_meals_dum = (bs_meals_cat == 3) if !missing(bs_meals_cat)
label variable bs_meals_dum "Had 3 or more meals yesterday"
order bs_meals_dum bs_meals_cat, after(bs_meals)

generate bs_goodhealth = cond(bs_health <= 2, 1, 0) if !missing(bs_health)
label variable bs_goodhealth "Indicator that student has good self-reported health or better"
order bs_goodhealth, after(bs_health)

* create indicators on learning & motivation
* items 04 and 06 are reverse-coded (1<->5, 2<->4, 3 stays) into *R copies;
* any value other than 1-5 is left missing in the copy
foreach raw in bs_learnmotive04 bs_learnmotive06 {
	gen `raw'R = 6 - `raw' if inlist(`raw', 1, 2, 3, 4, 5)
}
local lm_items bs_learnmotive01 bs_learnmotive02 bs_learnmotive03 bs_learnmotive04R ///
	bs_learnmotive05 bs_learnmotive06R bs_learnmotive07 bs_learnmotive08 ///
	bs_learnmotive09 bs_learnmotive10
foreach item of local lm_items {
	gen `item'_dum = `item' == 5 if !missing(`item')
	lab var `item'_dum "responded 'very true' for `item'"
}
egen bs_learnmotive_score = rowtotal(`lm_items'), missing
egen bs_learnmotive_score_z = std(bs_learnmotive_score)

* create indicators on class environments
forvalues k = 1/5 {
	local item bs_classenviron`k'
	gen `item'_dum = `item' == 5 if !missing(`item')
	lab var `item'_dum "responded 'very true' for `item'"
}
egen bs_classenv_score = rowtotal(bs_classenviron1-bs_classenviron5), missing
egen bs_classenv_score_z = std(bs_classenv_score)

* create indicators on perception of school teachers
forvalues k = 1/8 {
	local item bs_teacherpercep`k'
	gen `item'_dum = `item' == 5 if !missing(`item')
	lab var `item'_dum "responded 'very true' for `item'"
}
egen bs_teacherpercep_score = rowtotal(bs_teacherpercep1-bs_teacherpercep8), missing
egen bs_teacherpercep_score_z = std(bs_teacherpercep_score)

* Rosenberg's Self-Esteem Scale scoring
* source: https://www.wwnorton.com/college/psych/psychsci/media/rosenberg.htm
* Positively worded items are shifted to 0-3, negatively worded items are
* reversed to 3-0; the scale is the sum over the ten recoded items.
local rse_pos bs_q901 bs_q903 bs_q904 bs_q907 bs_q910
local rse_neg bs_q902 bs_q905 bs_q906 bs_q908 bs_q909
recode `rse_pos' (1=0) (2=1) (3=2) (4=3), ///
	generate(pos_stat1 pos_stat2 pos_stat3 pos_stat4 pos_stat5)
recode `rse_neg' (1=3) (2=2) (3=1) (4=0), ///
	generate(neg_stat1 neg_stat2 neg_stat3 neg_stat4 neg_stat5)
egen bs_selfesteem = rowtotal(pos_stat* neg_stat*), missing
egen selfesteem_ans = rownonmiss(pos_stat* neg_stat*)

* imputed version: complete cases keep their total; with 8 or 9 answers the
* total is scaled up to ten items; with fewer than 8 answers it stays missing
gen bs_selfesteem_impute = bs_selfesteem if selfesteem_ans == 10
replace bs_selfesteem_impute = bs_selfesteem*10/selfesteem_ans if inlist(selfesteem_ans, 8, 9)
lab var bs_selfesteem_impute "Rosenberg Self-esteem Scale (imputed)"

* strict version: only respondents who answered all 10 questions are scored
replace bs_selfesteem = . if selfesteem_ans < 10
lab var bs_selfesteem "Rosenberg Self-esteem Scale"

order bs_selfesteem bs_selfesteem_impute, after(bs_q910)
drop pos_stat* neg_stat* selfesteem_ans
	
* Short Grit Scale scoring
* source: http://www.sjdm.org/dmidi/files/Grit-8-item.pdf
* Four items are kept as answered, four are reversed (1<->5, 2<->4); the scale
* is the mean over the eight recoded items.
local grit_pos bs_q912 bs_q914 bs_q917 bs_q918
local grit_neg bs_q911 bs_q913 bs_q915 bs_q916
recode `grit_pos' (1=1) (2=2) (3=3) (4=4) (5=5), ///
	generate(pos_stat1 pos_stat2 pos_stat3 pos_stat4)
recode `grit_neg' (1=5) (2=4) (3=3) (4=2) (5=1), ///
	generate(neg_stat1 neg_stat2 neg_stat3 neg_stat4)
egen bs_grit = rowtotal(pos_stat* neg_stat*), missing
egen grit_ans = rownonmiss(pos_stat* neg_stat*)

* imputed version: complete cases keep their total; with 6 or 7 answers the
* total is scaled up to eight items; with fewer than 6 answers it stays missing
gen bs_grit_impute = bs_grit if grit_ans == 8
replace bs_grit_impute = bs_grit*8/grit_ans if inlist(grit_ans, 6, 7)
replace bs_grit_impute = bs_grit_impute / 8
lab var bs_grit_impute "Short Grit Scale (imputed)"

* strict version: only respondents who answered all 8 questions are scored
replace bs_grit = . if grit_ans < 8
replace bs_grit = bs_grit / 8
lab var bs_grit "Short Grit Scale"

order bs_grit bs_grit_impute, after(bs_q918)
drop pos_stat* neg_stat* grit_ans

* Big 5 Personality  - Conscientiousness scoring
* source: http://fetzer.org/sites/default/files/images/stories/pdf/selfmeasures/Personality-BigFiveInventory.pdf
* Five items are kept as answered and four are reversed (1<->5, 2<->4); the
* scale is the sum over the nine recoded items.
recode bs_q919 bs_q921 bs_q924 bs_q925 bs_q926 (1=1) (2=2) (3=3) (4=4) (5=5), ///
	generate(pos_stat1 pos_stat2 pos_stat3 pos_stat4 pos_stat5)
recode bs_q920 bs_q922 bs_q923 bs_q927 (1=5) (2=4) (3=3) (4=2) (5=1), ///
	generate(neg_stat1 neg_stat2 neg_stat3 neg_stat4)
egen bs_conscientious = rowtotal(pos_stat* neg_stat*), missing
egen conscientious_ans = rownonmiss(pos_stat* neg_stat*)

* imputed version: totals based on 7 or 8 answers are scaled up to nine items;
* respondents with fewer than 7 answers are set to missing
generate bs_conscientious_impute = bs_conscientious
replace bs_conscientious_impute = bs_conscientious_impute*9/8 if conscientious_ans == 8  // impute for those who answered 8 of 9 questions
replace bs_conscientious_impute = bs_conscientious_impute*9/7 if conscientious_ans == 7  // impute for those who answered 7 of 9 questions
replace bs_conscientious_impute = . if conscientious_ans < 7
label variable bs_conscientious_impute "Conscientiousness Scale (imputed)"

* strict version. The cut-off has to be 9 (the number of items): with "< 8"
* respondents who skipped one item kept an incomplete, downward-biased total.
replace bs_conscientious = . if conscientious_ans < 9  // only score for those who answered all 9 questions
label variable bs_conscientious "Big 5 Personality  - Conscientiousness scoring"
order bs_conscientious bs_conscientious_impute, after(bs_q927)
drop pos_stat* neg_stat* conscientious_ans

* create standardized test scores by grade
* Each score is standardized separately within grade 4 and grade 5 (variable
* std). A pupil has at most one of the two grade-specific z-scores, so rowmin()
* simply picks the non-missing one; pupils in neither grade stay missing.
foreach score of varlist bs_cog bs_com bs_listen_chi bs_listen_eng {
	foreach grade in 4 5 {
		egen `score'z_`grade' = std(`score') if std == `grade'
	}
	egen `score'z = rowmin(`score'z_*)
	local rawlabel "`: var label `score''"
	lab var `score'z "`rawlabel' (standardized by grade)"
	drop `score'z_*
}

* keep the block of variables from id through village plus everything prefixed bs_
keep id-village bs_*

save "$cleaned_dir/baseline_cleaned.dta", replace


********************************************************************************
*			2. WEEKLY ATTENDANCE
********************************************************************************
* 2. open the weekly attendance data
use "$rawdata_dir/Weekly Test/summer_attend.dta", clear

* rename variable
rename (summeroffer3 id_teacher) (summer_school summer_teacher_id)

* additional labeling
lab var summer_school "Summer school offer indicator"

* create number of attendance; sum up from day_1 to day_30
* rowtotal() without the -missing- option counts missing days as zero; the
* range day_1-day_30 relies on the day variables being stored contiguously
egen num_attend = rowtotal(day_1-day_30)
lab var num_attend "Total number of attendance over 30 days"

* create attendance rate
generate rate_attend = num_attend / 30
label variable rate_attend "Attendance rate over 30 days"

* create categorical variable for summer school language
egen summer_language = group(Lang3), label
lab var summer_language "Language of instruction during the summer school"

keep id num_attend rate_attend summer_*

save "$cleaned_dir/summer_attendance_cleaned.dta", replace


********************************************************************************
*			3. WEEKLY TEST DURING SUMMER PROGRAM
********************************************************************************
* 3. open weekly test data of the Summer Program
* -use- has no -replace- option; -clear- is what discards the data in memory
use "$rawdata_dir/Weekly Test/summer_weekly_test.dta", clear

* reformat next grade status as categorical
* The raw variable "repeat" is parked under a temporary name, re-encoded with
* egen group() (which also builds the value label), and then dropped.
rename repeat next_grade_old
egen next_grade = group(next_grade_old), label
order next_grade, after(next_grade_old)
drop next_grade_old
label variable next_grade "whether the learner progressed to the next grade or not"

* create standardized test scores by grade
* Scores are standardized separately within grade 4 and grade 5 (variable std);
* rowmin() then picks whichever grade-specific z-score is non-missing.
foreach var of varlist math_* social_* {
	egen `var'z_4 = std(`var') if std == 4
	egen `var'z_5 = std(`var') if std == 5
	egen `var'z = rowmin(`var'z_*)
	local temp "`: var label `var''"
	label variable `var'z "`temp' (standardized by grade)"
	drop `var'z_*
}

* create total score across all weeks and standardize
* The "total" is the row mean of the four weekly scores and the final score
* (rowmean ignores missing scores). The grade-specific helpers *_totalz_4 and
* *_totalz_5 are not dropped, so they remain in the saved file as before.
foreach i in math social {
	egen `i'_total = rowmean(`i'_1  `i'_2  `i'_3  `i'_4  `i'_Final)
	// stray ")" removed from the label text
	lab var `i'_total "Average `i' score, 1-5 weeks"
	egen `i'_totalz_4 = std(`i'_total) if std==4
	egen `i'_totalz_5 = std(`i'_total) if std==5
	egen `i'_totalz = rowmin(`i'_totalz_*)
	lab var `i'_totalz "Average `i' score, 1-5 weeks, standardized by grade"
}


keep id math* social* next_grade

save "$cleaned_dir/summer_tests_cleaned.dta", replace

********************************************************************************
*			4. EXIT SURVEY
********************************************************************************
* 4. open the exit survey of the Summer Program (participant survey); prefix is "es_"
use "$rawdata_dir/exit survey/learner_survey.dta", clear

* add the prefix
ren q* es_q*

* making dummies of some outcome variables
* Every dummy is built with "if !missing(...)" so that system missing (.) AND
* extended missing values (.a-.z) stay missing. Blanking only "== ." would
* turn extended missings into 1 on the ">=" tests and into 0 elsewhere.
* The -tab- calls are diagnostics only and do not change the data.
tab es_q103, nolabel m
gen es_q103_dum = es_q103 >= 3 if !missing(es_q103)
lab var es_q103_dum "Summer school program helpful for study (1=Yes 0=No)"
tab es_q103_dum, m

* language mainly spoken by the teacher: codes 1-2, 3 and 4-5 form three groups
tab es_q105, nolabel m
gen es_q105_chi  = inlist(es_q105, 1, 2) if !missing(es_q105)
gen es_q105_both = es_q105 == 3          if !missing(es_q105)
gen es_q105_eng  = inlist(es_q105, 4, 5) if !missing(es_q105)
foreach i in chi both eng {
	lab var es_q105_`i' "`i' language mainly spoken by teacher in the class"
}

* recode answer 2 to 0 in place for three items
* NOTE(review): any value label attached to these items is not updated here,
* so the new 0 may display unlabeled - confirm whether that matters downstream
foreach q in 106 110 112 {
	tab es_q`q', nolabel m
	replace es_q`q' = 0 if es_q`q' == 2
	tab es_q`q', m
}

tab es_q107, nolabel m
gen es_q107_dum = es_q107 <= 2 if !missing(es_q107)
lab var es_q107_dum "Asked Qs more at least 1 time per day"

* two-way language items: code 1 -> *_chi, code 2 -> *_eng
foreach q in 108 109 113 {
	tab es_q`q', m
	tab es_q`q', nolabel m
	gen es_q`q'_chi = es_q`q' == 1 if !missing(es_q`q')
	gen es_q`q'_eng = es_q`q' == 2 if !missing(es_q`q')
}

tab es_q111, m
tab es_q111, nolab m
gen es_q111_dum = es_q111 >= 4 if !missing(es_q111)
lab var es_q111_dum "Satisfied with summer school"

keep id es_q*


save "$cleaned_dir/exitsurvey_cleaned.dta", replace


*********************************************************************************
*			5. 1ST FOLLOW-UP SURVEY
********************************************************************************
* 5. open the 1st follow up survey data
* Replaces the data in memory with the raw 2017 follow-up file.
use "$rawdata_dir/follow-up data 2017/2017_followup_survey_final.dta", clear

* rename variables
* Raw questionnaire items are named q<number>_fol; the first rename strips the
* "_fol" suffix and adds the "fs_" prefix in one step. The question numbers
* are then replaced by descriptive names. In the wildcard renames the text
* matched by "*" is carried over to the new name
* (e.g. fs_q108a becomes fs_parent_statementa).
* Items that are not renamed here (e.g. fs_q201hr, fs_q217min, fs_q702hr) keep
* their prefixed question number and are referenced that way further down.
rename q*_fol fs_q*
* identifiers that do not follow the q* pattern
rename school_code fs_school_id
rename std fs_std
rename class fs_class
* q1xx: parents, date of birth, age, ethnicity, home language
rename fs_q102a fs_fatheralive
rename fs_q102b fs_motheralive
rename fs_q104day fs_dob_day
rename fs_q104month fs_dob_month
rename fs_q104year fs_dob_year
rename fs_q104age fs_age
rename fs_q106 fs_fatherlive
rename fs_q107 fs_motherlive
rename fs_q108* fs_parent_statement*
rename fs_q109 fs_ethnicity
rename fs_q110 fs_language
* q2xx: schooling, study materials, goals, study habits, homework
rename fs_q202 fs_repeat
rename fs_q203 fs_absent
rename fs_q204 fs_absent_reason
rename fs_q205 fs_owndesk
rename fs_q206* fs_owntxtbook*
rename fs_q207 fs_notxtbook	
rename fs_q208* fs_ownstationary*
rename fs_q209 fs_edugoals_noobs
rename fs_q210 fs_edugoals_obs
rename fs_q211 fs_edumotive1
rename fs_q212 fs_edumotive2
rename fs_q213 fs_edumotive3
rename fs_q214* fs_subjectexp*
rename fs_q215* fs_subjectperf*
rename fs_q216 fs_studywkdays
rename fs_q218 fs_studywkdays_location
rename fs_q219* fs_studywkdays_company*
rename fs_q220 fs_gethw
rename fs_q221 fs_misshw
rename fs_q222 fs_studywkends
rename fs_q224 fs_studywkends_location
rename fs_q225* fs_studywkends_company*
* q3xx: meals and health
rename fs_q301 fs_meals
rename fs_q302 fs_health
* q4xx-q6xx: the wildcard keeps the trailing digits, which is why later code
* refers to fs_learnmotive01..10, fs_classenviron1..5 and fs_teacherpercep1..8
rename fs_q4* fs_learnmotive*
rename fs_q50* fs_classenviron*
rename fs_q60* fs_teacherpercep*
* q7xx: summer study and work
rename fs_q701 fs_studysummer_days
rename fs_q703 fs_studysummer_location
rename fs_q704* fs_studysummer_company*
rename fs_q705 fs_worksummer
rename fs_q706 fs_affsummerprog
* test scores carry the "_fol" suffix but do not start with "q", so they are
* renamed explicitly
ren cog_score_fol fs_cog
ren com_score_fol fs_com
ren fs_q1101 fs_svymath
ren fs_q1102 fs_svysocial

* additional labeling
* -- value labels: yes_no and level_edu are dropped and rebuilt, ethnicity is new
label drop yes_no
label define yes_no ///
	0  "no" ///
	1  "yes" ///
	99 "unknown"
label values fs_*alive yes_no

label define ethnicity ///
	1 "chewa" ///
	2 "lomwe" ///
	3 "yao" ///
	4 "ngoni" ///
	5 "tumbuka" ///
	6 "other"
* the same label set is attached to ethnicity and to home language
label values fs_ethnicity fs_language ethnicity

label drop level_edu
label define level_edu ///
	1  "Primary School" ///
	2  "Secondary School" ///
	3  "Technical/Vocational" ///
	4  "College/University" ///
	5  "Graduate School" ///
	99 "Don't know"

* -- variable labels
lab var fs_age           "Age"
lab var fs_learnmotive04 "respondent has uneasy upset feeling when taking a test"
lab var fs_subjectexpk   "respondent studies hard in social studies"
lab var fs_studywkdays   "number of days on average respondent studies outside school during weekdays"
lab var fs_studywkends   "number of days on average respondent studies outside school during weekends"

* reformat gender as categorical
* The raw variable is parked under a temporary name, re-encoded with
* egen group() (which also builds the value label), and then removed.
rename gender gender_old
egen fs_gender = group(gender_old), label
gen fs_female = fs_gender == 1 if !missing(fs_gender)
* put the two new variables where the raw one used to be
order fs_gender fs_female, after(gender_old)
drop gender_old
lab var fs_gender "gender"
lab var fs_female "Indicator for female"

* create indicator for q108 parent-related statements
* dummy = 1 when the statement is answered 5, missing when unanswered
foreach s in a b c d e f g h i j k l {
	local item fs_parent_statement`s'
	gen fs_parent_`s'_dum = `item' == 5 if !missing(`item')
	lab var fs_parent_`s'_dum "answered 'strongly agree' for q108`s'"
}

* raw sub-scores over items a-b, c-d and e-l, each standardized over the sample
egen fs_parent_ab_score = rowtotal(fs_parent_statementa fs_parent_statementb), missing
egen fs_parent_ab_score_z = std(fs_parent_ab_score)

egen fs_parent_cd_score = rowtotal(fs_parent_statementc fs_parent_statementd), missing
egen fs_parent_cd_score_z = std(fs_parent_cd_score)

local el_items fs_parent_statemente fs_parent_statementf fs_parent_statementg ///
	fs_parent_statementh fs_parent_statementi fs_parent_statementj ///
	fs_parent_statementk fs_parent_statementl
egen fs_parent_el_score = rowtotal(`el_items'), missing
egen fs_parent_el_score_z = std(fs_parent_el_score)


* create indicator for co-residence with parents
*   1 = father and mother are both coded 1
*   0 = at least one of them is coded 2 or 3
*   . = every other combination
gen fs_bothparents = cond(fs_fatherlive == 1 & fs_motherlive == 1, 1, ///
	cond(inrange(fs_fatherlive, 2, 3) | inrange(fs_motherlive, 2, 3), 0, .))
lab var fs_bothparents "Indicator that student lives with both parents"
order fs_bothparents, after(fs_motherlive)

* create indicators for ethnicity and language
* Each indicator is 1/0 for respondents with a non-missing answer and missing
* otherwise.
generate fs_chewa = (fs_ethnicity == 1) if fs_ethnicity < .
label variable fs_chewa "Indicator for Chewa ethnicity"
generate fs_ngoni = (fs_ethnicity == 4) if fs_ethnicity < .
* label spelling corrected ("Nogni" -> "Ngoni")
label variable fs_ngoni "Indicator for Ngoni ethnicity"
order fs_chewa fs_ngoni, after(fs_ethnicity)
generate fs_chewalang = (fs_language == 1) if fs_language < .
label variable fs_chewalang "Indicator for Chewa language used at home"
order fs_chewalang, after(fs_language)

* create variable for time spent traveling to school
* hours and minutes are combined into minutes
gen fs_schtravel = 60*fs_q201hr + fs_q201min
lab var fs_schtravel "Time spent travelling from home to school (minutes)"
order fs_schtravel, after(fs_q201min)

* create indicator for ever repeat grade
gen fs_ever_repeat = inlist(fs_repeat, 2, 3) if !missing(fs_repeat)
lab var fs_ever_repeat "Indicator for ever repeated a grade"
order fs_ever_repeat, after(fs_repeat)

* create indicator for reasons for being absent from school
gen fs_absent_sick = fs_absent_reason == 1 if !missing(fs_absent_reason)
gen fs_absent_work = fs_absent_reason == 2 if !missing(fs_absent_reason)
lab var fs_absent_sick "Indicator for being absent from school due to sickness"
lab var fs_absent_work "Indicator for being absent from school due to work"
order fs_absent_sick fs_absent_work, after(fs_absent_reason)

* create indicators for own desk at school
* answer codes 1, 2, 3 map to never, sometimes, always
local code = 0
foreach freq in never sometimes always {
	local ++code
	gen fs_owndesk_`freq' = fs_owndesk == `code' if !missing(fs_owndesk)
	lab var fs_owndesk_`freq' "Indicator for `freq' having own desk at school"
}
order fs_owndesk_*, after(fs_owndesk)

* create variable for study materials
* row sums over the ownership items; missing only if every item is missing
egen fs_num_txtbk = rowtotal(fs_owntxtbook*), missing
lab var fs_num_txtbk "Total number of textbooks owned"
order fs_num_txtbk, after(fs_owntxtbookf)

egen fs_num_stat = rowtotal(fs_ownstationary*), missing
lab var fs_num_stat "Total number of stationaries owned"
order fs_num_stat, after(fs_ownstationaryc)

* create indicators for studying without textbook
gen fs_notxtbook_borrow = inrange(fs_notxtbook, 3, 5) if !missing(fs_notxtbook)
gen fs_notxtbook_classnotes = fs_notxtbook == 6 if !missing(fs_notxtbook)
lab var fs_notxtbook_borrow "Indicator that student borrow books when he/she does not have textbook"
lab var fs_notxtbook_classnotes "Indicator that student studies class notes when he/she does not have textbook"
order fs_notxtbook_*, after(fs_notxtbook)

* create indicators for education goals
* answers 4 and 5 count as wanting to complete college
foreach scen in noobs obs {
	local phrase = cond("`scen'" == "noobs", "no", "face")
	gen fs_college_`scen' = inrange(fs_edugoals_`scen', 4, 5) if !missing(fs_edugoals_`scen')
	lab var fs_college_`scen' "Indicator that student wants to complete college if `phrase' obstacles"
	order fs_college_`scen', after(fs_edugoals_`scen')
}

* create indicators for education motives
* dummy = 1 when the item is answered 5; the score is the row sum of the items
forvalues k = 1/3 {
	local item fs_edumotive`k'
	gen `item'_dum = `item' == 5 if !missing(`item')
	lab var `item'_dum "responded 'strongly agree' to `item'"
}
egen fs_edumotive_score = rowtotal(fs_edumotive1 fs_edumotive2 fs_edumotive3), missing
egen fs_edumotive_score_z = std(fs_edumotive_score)

* create indicators for experience on each subject
foreach s in a b c d e f g h i j k l {
	local item fs_subjectexp`s'
	gen `item'_dum = `item' == 5 if !missing(`item')
	lab var `item'_dum "responded 'totally true' to `item'"
}
* the two sub-scores use variable ranges, i.e. they rely on items a-f and g-l
* being stored next to each other in the dataset
egen fs_subjectexp_af = rowtotal(fs_subjectexpa-fs_subjectexpf), missing
egen fs_subjectexp_gl = rowtotal(fs_subjectexpg-fs_subjectexpl), missing
foreach part in af gl {
	egen fs_subjectexp_`part'_z = std(fs_subjectexp_`part')
}

* create indicators for school performance
foreach s in a b c d e f g {
	local item fs_subjectperf`s'
	gen `item'_dum = inrange(`item', 4, 5) if !missing(`item')
	lab var `item'_dum "self-reported performance for `item' is good or very good"
}

* study time (minutes) and studying-at-home indicator, weekdays then weekends
* each spec is: name stub, question number of the hr/min pair, wording for labels
foreach spec in "wkdays 217 weekdays" "wkends 223 weekends" {
	local stub : word 1 of `spec'
	local qnum : word 2 of `spec'
	local when : word 3 of `spec'

	gen fs_study`stub'_time = 60*fs_q`qnum'hr + fs_q`qnum'min
	lab var fs_study`stub'_time "Time spent studying outside school on `when' (minutes)"
	order fs_study`stub'_time, after(fs_q`qnum'min)

	gen fs_study`stub'_home = fs_study`stub'_location == 1 if !missing(fs_study`stub'_location)
	lab var fs_study`stub'_home "Indicator that student usually studies at home on `when'"
	order fs_study`stub'_home, after(fs_study`stub'_location)
}

* create indicators for homework
gen fs_gethw_dum = fs_gethw == 4 if !missing(fs_gethw)
lab var fs_gethw_dum "Teachers gave homework more than 4 times/month"

gen fs_misshw_dum = fs_misshw == 1 if !missing(fs_misshw)
lab var fs_misshw_dum "Never missed homework in a month"

* create variables for health and nutrition
* meals are top-coded at 3 in the categorical copy
clonevar fs_meals_cat = fs_meals
replace fs_meals_cat = 3 if inrange(fs_meals_cat, 3, .)
label define meals_cat ///
	0 "No meals" ///
	1 "1 meal" ///
	2 "2 meals" ///
	3 "3 or more meals"
label values fs_meals_cat meals_cat
lab var fs_meals "Number of meals yesterday"

* dummy for the top category of the top-coded meals variable
gen fs_meals_dum = fs_meals_cat == 3 if !missing(fs_meals_cat)
lab var fs_meals_dum "Had 3 or more meals yesterday"
order fs_meals_dum fs_meals_cat, after(fs_meals)

gen fs_goodhealth = fs_health <= 2 if !missing(fs_health)
lab var fs_goodhealth "Indicator that student has good self-reported health or better"
order fs_goodhealth, after(fs_health)

* learning & motivation indicators
* reverse-code items 04 and 06 on the 1-5 scale (1<->5, 2<->4, 3 stays 3);
* any value outside 1-5 is left missing in the reversed copy
foreach item in fs_learnmotive04 fs_learnmotive06 {
	generate `item'R = 6 - `item' if inlist(`item', 1, 2, 3, 4, 5)
}

* the ten items entering the score, with the reversed versions of 04 and 06
local lm_items fs_learnmotive01 fs_learnmotive02 fs_learnmotive03 fs_learnmotive04R fs_learnmotive05 fs_learnmotive06R fs_learnmotive07 fs_learnmotive08 fs_learnmotive09 fs_learnmotive10

* dummy for the top answer (5) on each item
foreach item of local lm_items {
	generate `item'_dum = (`item' == 5) if !missing(`item')
	label variable `item'_dum "responded 'very true' for `item'"
}

* summed score (missing only when all ten items are missing) and its z-score
egen fs_learnmotive_score = rowtotal(`lm_items'), missing
egen fs_learnmotive_score_z = std(fs_learnmotive_score)

* class environment indicators: dummy for the top answer (5) on each of the five items
forvalues k = 1/5 {
	local item fs_classenviron`k'
	generate `item'_dum = (`item' == 5) if !missing(`item')
	label variable `item'_dum "responded 'very true' for `item'"
}

* summed score over the five items (missing only when all are missing) and its z-score
egen fs_classenv_score = rowtotal(fs_classenviron1-fs_classenviron5), missing
egen fs_classenv_score_z = std(fs_classenv_score)

* perception of school teachers: dummy for the top answer (5) on each of the eight items
forvalues k = 1/8 {
	local item fs_teacherpercep`k'
	generate `item'_dum = (`item' == 5) if !missing(`item')
	label variable `item'_dum "responded 'very true' for `item'"
}

* summed score over the eight items (missing only when all are missing) and its z-score
egen fs_teacherpercep_score = rowtotal(fs_teacherpercep1-fs_teacherpercep8), missing
egen fs_teacherpercep_score_z = std(fs_teacherpercep_score)
	
* indicator for never studying during the summer (answer code 1 of fs_studysummer_days)
generate fs_summernostudy = (fs_studysummer_days == 1) if !missing(fs_studysummer_days)
label variable fs_summernostudy "Indicator for never studying during the summer holiday"
order fs_summernostudy, after(fs_studysummer_days)

* daily summer study time in minutes (hours*60 + minutes); missing if either part is missing
generate fs_studysummer_time = fs_q702hr*60 + fs_q702min
label variable fs_studysummer_time "Time spent studying a day during the summer (minutes)"
order fs_studysummer_time, after(fs_q702min)

* indicator for studying at home during summer (location code 1)
generate fs_studysummer_home = (fs_studysummer_location == 1) if !missing(fs_studysummer_location)
label variable fs_studysummer_home "Indicator that student usually studies at home during summer"
order fs_studysummer_home, after(fs_studysummer_location)

* summer-program participation: show the raw codes, then set code 3 to missing
tabulate fs_affsummerprog, missing nolabel
replace fs_affsummerprog = . if fs_affsummerprog == 3

* Rosenberg's Self-Esteem Scale scoring
* source: https://www.wwnorton.com/college/psych/psychsci/media/rosenberg.htm
* Ten items answered 1-4: five are rescaled to 0-3 as answered, five are reverse-scored to 3-0.
local rses_pos fs_q801 fs_q803 fs_q804 fs_q807 fs_q810
local rses_neg fs_q802 fs_q805 fs_q806 fs_q808 fs_q809
recode `rses_pos' (1=0) (2=1) (3=2) (4=3), ///
	generate(pos_stat1 pos_stat2 pos_stat3 pos_stat4 pos_stat5)
recode `rses_neg' (1=3) (2=2) (3=1) (4=0), ///
	generate(neg_stat1 neg_stat2 neg_stat3 neg_stat4 neg_stat5)

* raw sum over answered items and the number of items answered
egen fs_selfesteem = rowtotal(pos_stat* neg_stat*), missing
egen selfesteem_ans = rownonmiss(pos_stat* neg_stat*)

* imputed score: scale the partial sum up to 10 items when 8 or 9 were answered,
* keep the plain sum when all 10 were answered, and leave it missing otherwise
generate fs_selfesteem_impute = fs_selfesteem*10/selfesteem_ans if inlist(selfesteem_ans, 8, 9)
replace fs_selfesteem_impute = fs_selfesteem if selfesteem_ans == 10
label variable fs_selfesteem_impute "Rosenberg Self-esteem Scale (imputed)"

* complete-case score: only respondents who answered all 10 items keep a value
replace fs_selfesteem = . if selfesteem_ans < 10
label variable fs_selfesteem "Rosenberg Self-esteem Scale"

order fs_selfesteem fs_selfesteem_impute, after(fs_q810)
drop pos_stat* neg_stat* selfesteem_ans
	
* Short Grit Scale scoring
* source: http://www.sjdm.org/dmidi/files/Grit-8-item.pdf
* Eight items answered 1-5: four are scored as answered, four are reverse-scored.
local grit_pos fs_q812 fs_q814 fs_q817 fs_q818
local grit_neg fs_q811 fs_q813 fs_q815 fs_q816
recode `grit_pos' (1=1) (2=2) (3=3) (4=4) (5=5), ///
	generate(pos_stat1 pos_stat2 pos_stat3 pos_stat4)
recode `grit_neg' (1=5) (2=4) (3=3) (4=2) (5=1), ///
	generate(neg_stat1 neg_stat2 neg_stat3 neg_stat4)

* raw sum over answered items and the number of items answered
egen fs_grit = rowtotal(pos_stat* neg_stat*), missing
egen grit_ans = rownonmiss(pos_stat* neg_stat*)

* imputed score: scale the partial sum up to 8 items when 6 or 7 were answered,
* keep the plain sum when all 8 were answered, leave it missing otherwise,
* then divide by 8 to express it as an item average
generate fs_grit_impute = fs_grit*8/grit_ans if inlist(grit_ans, 6, 7)
replace fs_grit_impute = fs_grit if grit_ans == 8
replace fs_grit_impute = fs_grit_impute / 8
label variable fs_grit_impute "Short Grit Scale (imputed)"

* complete-case score: only respondents who answered all 8 items, as an item average
replace fs_grit = . if grit_ans < 8
replace fs_grit = fs_grit / 8
label variable fs_grit "Short Grit Scale"

order fs_grit fs_grit_impute, after(fs_q818)
drop pos_stat* neg_stat* grit_ans

* Big 5 Personality  - Conscientiousness scoring
* source: http://fetzer.org/sites/default/files/images/stories/pdf/selfmeasures/Personality-BigFiveInventory.pdf
* Nine items answered 1-5: five are scored as answered, four are reverse-scored.
recode fs_q819 fs_q821 fs_q824 fs_q825 fs_q826 (1=1) (2=2) (3=3) (4=4) (5=5), ///
	generate(pos_stat1 pos_stat2 pos_stat3 pos_stat4 pos_stat5)
recode fs_q820 fs_q822 fs_q823 fs_q827 (1=5) (2=4) (3=3) (4=2) (5=1), ///
	generate(neg_stat1 neg_stat2 neg_stat3 neg_stat4)

* raw sum over answered items and the number of items answered
egen fs_conscientious = rowtotal(pos_stat* neg_stat*), missing
egen conscientious_ans = rownonmiss(pos_stat* neg_stat*)

* imputed score: scale the partial sum up to 9 items when 7 or 8 were answered
generate fs_conscientious_impute = fs_conscientious
replace fs_conscientious_impute = fs_conscientious_impute*9/8 if conscientious_ans == 8  // impute for those who answered 8 of 9 questions
replace fs_conscientious_impute = fs_conscientious_impute*9/7 if conscientious_ans == 7  // impute for those who answered 7 of 9 questions
replace fs_conscientious_impute = . if conscientious_ans < 7
label variable fs_conscientious_impute "Conscientiousness Scale (imputed)"

* complete-case score: the scale has 9 items, so the cut-off must be 9.
* A cut-off of 8 would let respondents with 8 answers keep an un-imputed sum
* that is biased downward by the unanswered item.
replace fs_conscientious = . if conscientious_ans < 9  // only score for those who answered all 9 questions
label variable fs_conscientious "Big 5 Personality  - Conscientiousness scoring"

order fs_conscientious fs_conscientious_impute, after(fs_q827)
drop pos_stat* neg_stat* conscientious_ans

* survey test scores: the value 99 is a missing-value code
foreach subj in fs_svymath fs_svysocial {
	replace `subj' = . if `subj' == 99
}

* standardized test scores by grade (standards 4, 5 and 6):
* each grade gets its own z-score column; since a learner has a value in at most
* one of them, rowmin() collapses the columns into a single variable
foreach var of varlist fs_svymath-fs_com {
	forvalues g = 4/6 {
		egen `var'z_`g' = std(`var') if fs_std == `g'
	}
	egen `var'z = rowmin(`var'z_*)
	local base_label : variable label `var'
	label variable `var'z "`base_label' (standardized by grade)"
	drop `var'z_*
}

* keep the identifier and the follow-up variables only, then save
keep id fs_*

save "$cleaned_dir/followup_cleaned.dta", replace

********************************************************************************
*			6. PARENT SURVEY
********************************************************************************
* 6. open the summer school participant's parent survey; prefix is "ps_"
use "$rawdata_dir/parent survey/Parent survey 2017.dta", clear

* add the prefix
rename q* ps_q*

* agreement dummies for q201-205: 1 if the answer is above 3, 0 if it is 3 or below.
* !missing() is used rather than "!= ." because extended missing codes (.a-.z)
* sort above every number and differ from ".", so they would otherwise be coded 1.
forvalues i = 1/5 {
	generate ps_q20`i'_dum = (ps_q20`i' > 3) if !missing(ps_q20`i')
}

label variable ps_q201_dum "Agree that my child is highly interested in learning"
label variable ps_q202_dum "Agree that my child completes assignments on time"
label variable ps_q203_dum "Agree that higher educ is the best way for better life"
label variable ps_q204_dum "Agree that I can help child with study"
label variable ps_q205_dum "Agree that I am satisfied with child's overall performance"

* aggregate score for q201-205 (missing only when all five items are missing) and its z-score
egen ps_q201_5_total = rowtotal(ps_q201 ps_q202 ps_q203 ps_q204 ps_q205), missing
egen ps_q201_5_total_z = std(ps_q201_5_total)
label variable ps_q201_5_total "Total score for parents' view on learners' learning"
label variable ps_q201_5_total_z "Total score for parents' view on learners' learning (standardized)"

* dummy variables for q206-7
* q206: 1 if the answer is 3 or above, 0 if below 3.
* !missing() keeps extended missing codes (.a-.z) missing instead of coding them 1,
* which the former "!= ." guard would have done.
generate ps_q206_dum = (ps_q206 >= 3) if !missing(ps_q206)

label variable ps_q206_dum "Help child with study at least sometimes"

* q207: recode in place from 1/2 to 1/0
recode ps_q207 (1=1) (2=0)

* dummy/total variables for q301-304
* q301-303: 1 if the answer is above 3, 0 if it is 3 or below.
* !missing() keeps extended missing codes (.a-.z) missing instead of coding them 1,
* which the former "!= ." guard would have done.
forvalues i = 1/3 {
	generate ps_q30`i'_dum = (ps_q30`i' > 3) if !missing(ps_q30`i')
}

label variable ps_q301_dum "Agree that teachers provided equitable curriculum per learning needs"
label variable ps_q302_dum "Agree that teachers meet indv learning needs by indv'zing instruction"
label variable ps_q303_dum "Agree that teachers help understand child's progress"

* aggregate score for q301-303 (missing only when all three items are missing) and its z-score
egen ps_q301_3_total = rowtotal(ps_q301 ps_q302 ps_q303), missing
egen ps_q301_3_total_z = std(ps_q301_3_total)

label variable ps_q301_3_total "Total score for parents' view on teachers"
label variable ps_q301_3_total_z "Total score for parents' view on teachers (standardized)"

* q304: 1 if the answer is above 2, 0 if it is 2 or below
generate ps_q304_dum = (ps_q304 > 2) if !missing(ps_q304)

label variable ps_q304_dum "Informed by teacher on child performance at least twice a term"

* dummy/total variables for q401-404
* q401-403: 1 if the answer is above 3, 0 if it is 3 or below.
* !missing() keeps extended missing codes (.a-.z) missing instead of coding them 1,
* which the former "!= ." guard would have done.
forvalues i = 1/3 {
	generate ps_q40`i'_dum = (ps_q40`i' > 3) if !missing(ps_q40`i')
}

* q404 is reverse-coded (1<->5, 2<->4, 3 unchanged) before building its dummy and the total
recode ps_q404 (1=5) (2=4) (4=2) (5=1), gen(ps_q404R)

generate ps_q404_dum = (ps_q404R > 3) if !missing(ps_q404R)

* aggregate score for q401-404 (missing only when all four items are missing) and its z-score
egen ps_q401_4_total = rowtotal(ps_q401 ps_q402 ps_q403 ps_q404R), missing
egen ps_q401_4_total_z = std(ps_q401_4_total)

label variable ps_q401_dum "Agree that good educ will help get ahead in life"
label variable ps_q402_dum "Agree that child learns impt skills at school"
label variable ps_q403_dum "Agree that better educ qual help get better jobs"
label variable ps_q404_dum "Disagree that schooling takes away time from earning"
label variable ps_q401_4_total "Total score for parents' view on education"
label variable ps_q401_4_total_z "Total score for parents' view on educ (standardized)"

* dummy/total variables for q501-508
* q501-503: 1 if the answer is above 3, 0 if it is 3 or below.
* !missing() keeps extended missing codes (.a-.z) missing instead of coding them 1,
* which the former "!= ." guard would have done.
forvalues i = 1/3 {
	generate ps_q50`i'_dum = (ps_q50`i' > 3) if !missing(ps_q50`i')
}

* aggregate score for q501-503 (missing only when all three items are missing) and its z-score
egen ps_q501_3_total = rowtotal(ps_q501 ps_q502 ps_q503), missing
egen ps_q501_3_total_z = std(ps_q501_3_total)

label variable ps_q501_dum "Agree that child enjoyed attending HS"
label variable ps_q502_dum "Agree that HS helpd my child improve academic skill"
label variable ps_q503_dum "Agree that HS has pos eff on attitude toward learning"
label variable ps_q501_3_total "Total score for benefits of Holiday School"
label variable ps_q501_3_total_z "Total score for benefits of Holiday School (standardized)"

* q505: recode in place from 1/2 to 1/0
recode ps_q505 (1=1) (2=0)

* q506: 1 if the answer is above 2, 0 if it is 2 or below
generate ps_q506_dum = (ps_q506 > 2) if !missing(ps_q506)

label variable ps_q506_dum "Willing to pay1000 kwacha or more for the holiday school"

* q508: 1 if the answer is above 3, 0 if it is 3 or below
generate ps_q508_dum = (ps_q508 > 3) if !missing(ps_q508)

label variable ps_q508_dum "Child had breakfast 4 or more times last week"


* keep the identifier and the parent-survey variables only, then save
keep id ps_q*

save "$cleaned_dir/parent_cleaned.dta", replace

********************************************************************************
*			7. TERM 3 SCORE (BASELINE EXAM SCORES)
********************************************************************************
* 7. open the 2016-2017 Academic Year Term 3 Exam Score data; prefix is "bs_"
use "$rawdata_dir/2016-2017 3term exam score/term3 score_appended.dta", clear

* additional labelling
label variable bs_sci "Primary Science for STD5 and Agriculture for STD4"
label variable bs_total_std4 "Total score for 16-17 3term exam for standard 4 (out of 550)"
label variable bs_total_std5 "Total score for 16-17 3term exam for standard 5 (out of 600)"

* standardized test scores by grade (standards 4 and 5):
* each grade gets its own z-score column; since a learner has a value in at most
* one of them, rowmin() collapses the columns into a single variable
foreach var of varlist bs_chi-bs_total_std5 {
	forvalues g = 4/5 {
		egen `var'z_`g' = std(`var') if std == `g'
	}
	egen `var'z = rowmin(`var'z_*)
	local base_label : variable label `var'
	label variable `var'z "`base_label' (standardized by grade)"
	drop `var'z_*
}

* single standardized total: take the z-score of the total that matches the learner's standard
generate bs_totalz = cond(std == 5, bs_total_std5z, bs_total_std4z) if inlist(std, 4, 5)

* keep the identifier and the baseline score variables only, then save
keep id bs_*

save "$cleaned_dir/2016T3score_cleaned.dta", replace

********************************************************************************
*			8. SUMMER SCHOOL TEACHER SURVEY
********************************************************************************
* 8. open the Summer School Teacher Survey data; prefix is "ts_"
use "$rawdata_dir/fu_sumteacher survey/teacher_survey_merged_final.dta", clear

* rename variables
rename id_teacher summer_teacher_id
rename tbs_q104 ts_age

* teaching-language indicator: 1 for "ENGLISH", 0 for "CHICHEWA", missing for anything else
generate ts_language = (language == "ENGLISH") if inlist(language, "ENGLISH", "CHICHEWA")
label variable ts_language "1 if teacher used English for teaching 0 if teaching in Chichewa"

* total teaching experience in whole years (rounded down):
* 0 when tbs_q106 is 1, the integer part of the specified value when tbs_q106 is 2
generate ts_totexp = cond(tbs_q106 == 1, 0, int(tbs_q106_specify)) if inlist(tbs_q106, 1, 2)
label variable ts_totexp "Total years of teaching experience (rounded down)"

* indicator for female teacher
// Assume tfu_gen_teacher == 1 is male
generate ts_female = (tfu_gen_teacher == 2) if !missing(tfu_gen_teacher)
label variable ts_female "Indicator for female teacher"
order ts_female, after(tfu_gen_teacher)

* dummy variables for some teacher outcomes; each stays missing when its item is missing
* q202: top answer (6) on the attendance-keeping item
generate tfu_q202_dum = (tfu_q202 == 6) if !missing(tfu_q202)
label variable tfu_q202_dum "kept attendance almost always (=1)"

* q204: top answer (5) on the group-activity item
tabulate tfu_q204, missing
generate tfu_q204_dum = (tfu_q204 == 5) if !missing(tfu_q204)
label variable tfu_q204_dum "conducted group activities more than 6 times/week"

* q205-q209: fold answer 5 into 4, then flag answer 4
foreach i in tfu_q205 tfu_q206 tfu_q207 tfu_q208 tfu_q209 {
	tabulate `i', missing
	replace `i' = 4 if `i' == 5
	generate `i'_dum = (`i' == 4) if !missing(`i')
	label variable `i'_dum "answered to large extent for `i'"
}

* summed score over q205-q209 and its z-score.
* The "missing" option leaves the score missing when all five items are missing;
* without it such a teacher would score 0 and distort the standardized score.
* This matches how every other composite score in this file is built.
egen tfu_tch_score = rowtotal(tfu_q205 tfu_q206 tfu_q207 tfu_q208 tfu_q209), missing
egen tfu_tch_score_z = std(tfu_tch_score)

* q210: answers 3 and above
tabulate tfu_q210, missing
generate tfu_q210_dum = (tfu_q210 >= 3) if !missing(tfu_q210)
label variable tfu_q210_dum "has most or all resources needed for instruction"

* language-use items q211-q213: split the 1-5 answer into three indicators
* (_chi: answers 1-2, _both: answer 3, _eng: answers 4 and above); missing if unanswered
local lang_items tfu_q211 tfu_q212 tfu_q213
foreach item of local lang_items {
	tabulate `item', missing
	generate `item'_chi = (`item' <= 2) if !missing(`item')
	generate `item'_both = (`item' == 3) if !missing(`item')
	generate `item'_eng = (`item' >= 4) if !missing(`item')
}

* _cor: 1 when the language used matches tfu_q214 (1 pairs with _chi, 2 pairs with _eng), else 0
* NOTE(review): teachers with a missing tfu_q214 or a missing item are coded 0, not missing - confirm intended
foreach item of local lang_items {
	generate `item'_cor = (tfu_q214 == 1 & `item'_chi == 1) | (tfu_q214 == 2 & `item'_eng == 1)
}

* q214 follow-ups: 1 when either tfu_q214_a or tfu_q214_b equals 4, else 0 (never missing)
generate tfu_q214_dum = (tfu_q214_a == 4) | (tfu_q214_b == 4)
label variable tfu_q214_dum "Never allowed learners to ask Qs in the other language"
	
* q218-q230: tabulate each item and flag the top answer (5); the dummy stays missing if unanswered
local fb_items tfu_q218 tfu_q219 tfu_q220 tfu_q221 tfu_q222 tfu_q223 tfu_q224 tfu_q225 tfu_q226 tfu_q227 tfu_q228 tfu_q229 tfu_q230
foreach i of local fb_items {
	tabulate `i', missing
	generate `i'_dum = (`i' == 5) if !missing(`i')
	label variable `i'_dum "answered 'Totally agree' for `i'"
}

* summed score over q218-q230 and its z-score.
* The "missing" option leaves the score missing when all thirteen items are missing;
* without it such a teacher would score 0 and distort the standardized score.
* This matches how every other composite score in this file is built.
egen tfu_fb_score = rowtotal(`fb_items'), missing
egen tfu_fb_score_z = std(tfu_fb_score)


* tfu_q201 tfu_q202_dum tfu_q204_dum tfu_tch_score_z tfu_q210_dum tfu_q211_chi tfu_q211_both tfu_q211_eng tfu_q212_chi tfu_q212_both tfu_q212_eng tfu_q213_chi tfu_q213_both tfu_q213_eng tfu_q211_cor tfu_q212_cor tfu_q213_cor tfu_q214_dum tfu_q218_dum tfu_q219_dum tfu_q220_dum tfu_q221_dum tfu_q222_dum tfu_q223_dum tfu_q224_dum tfu_q225_dum tfu_q226_dum tfu_q227_dum tfu_q228_dum tfu_q229_dum tfu_q230_dum tfu_fb_score_z
* (no 203 because all answered yes)

* additional labeling 
/*
label define keeprecords 1 "Less than once a week" 2 "About once a week" 3 "About twice a week" 4 "About 3 times a week" 5 "About 4 times a week" 6 "Almost always"
label values ts_q202 keeprecords
label define yes_no 0 "No" 1 "Yes"
label values ts_q203 yes_no
label define org_grp_activities 1 "Less than once" 2 "1~2 times" 3 "3~4 times" 4 "5~6 times" 5 "more than 6 times"
label values ts_q204 org_grp_activities
label define extent 1 "Not at all" 2 "Small extent" 3 "Moderate extent" 4 "Large extent"
label values ts_q205-ts_q209 extent
label define resources 1 "I do not have the resources I need" 2 "I have some of the resources I need" 3 "I have most of the resources I need" 4 "I have all the resources I need"
label values ts_q210 resources
label define lang_use 1 "Always Chichewa" 2 "Mostly Chichewa" 3 "Similar amount of both English and Chichewa" 4 "Mostly English" 5 "Always English"
label values ts_q211-ts_q213 lang_use
label define lang_teach 1 "Chichewa" 2 "English"
label values ts_q214 lang_teach
*/

* keep the teacher id plus the ts-prefixed and tfu_-prefixed variables
* (the ts* pattern already covers every ts_* variable), then save
keep summer_teacher_id ts* tfu_*

save "$cleaned_dir/teacher_cleaned.dta", replace


********************************************************************************
*			9. 2017 Term Score
********************************************************************************
* 9. open the 2017 Term score data; prefix is "fs_"
use "$rawdata_dir/2017-2018 1term exam score/2017_examscore_final.dta", clear

* new_std is converted to numeric so it can be compared with grade numbers below
destring new_std, replace

* pass indicator from the string field: "PASS", "2" and "3" count as pass,
* "FAIL" and "1" as fail, anything else stays missing
generate fs_pass = 1 if inlist(fs_passorfail, "PASS", "2", "3")
replace fs_pass = 0 if inlist(fs_passorfail, "FAIL", "1")
drop fs_passorfail

* additional labeling
label variable fs_sci "science score for 1 term 2017-2018"
forvalues g = 4/6 {
	label variable fs_total_std`g' "Total score (standard `g') for 1 term 2017-2018"
}

* standardized test scores by grade (standards 4, 5 and 6):
* each grade gets its own z-score column; since a learner has a value in at most
* one of them, rowmin() collapses the columns into a single variable
foreach var of varlist fs_eng-fs_total_std6 fs_ssbk {
	forvalues g = 4/6 {
		egen `var'z_`g' = std(`var') if new_std == `g'
	}
	egen `var'z = rowmin(`var'z_*)
	local base_label : variable label `var'
	label variable `var'z "`base_label' (standardized by grade)"
	drop `var'z_*
}

* single standardized total: take the z-score of the total that matches the learner's standard
generate fs_totalz = .
forvalues g = 4/6 {
	replace fs_totalz = fs_total_std`g'z if new_std == `g'
}

save "$cleaned_dir/2017_exam_cleaned.dta", replace

********************************************************************************
*			10. 2017-2018 2nd Term Quiz & Survey
********************************************************************************
* 10. open the 2018 Term quiz data; prefix is "f2_"
use "$rawdata_dir/2017-2018 2term quiz & survey/2017-2018 2term quiz&survey.dta", clear

* value labels for the survey items
label define f2_survey 1 "Totally Untrue" 2 "Mostly Untrue" 3 "Somewhat Untrue" 4 "Mostly True" 5 "Totally True" 99 "Don't Know", replace
label values survey_* f2_survey

* grade errors: standards recorded as 1 or 2 are set to 5
replace std = 5 if inlist(std, 1, 2)

* survey questions 1-18: the "Don't Know" code 99 becomes missing
forvalues q = 1/18 {
	replace survey_`q' = . if survey_`q' == 99
}

* Creating variables
* total quiz score: sum of all quiz_* items (missing only when every item is missing)
egen quiz_total = rowtotal(quiz_*), missing
label variable quiz_total "Quiz Total Score"

* z-score of the total within each grade (standards 5 and 6), using that grade's mean and sd
generate quiz_totalz = .
foreach g in 5 6 {
	summarize quiz_total if std == `g'
	replace quiz_totalz = (quiz_total - r(mean))/r(sd) if std == `g'
}
bysort std: summarize quiz_totalz
label variable quiz_totalz "Quiz Total Score, standardized by grade"

* total quiz score split by whether the material was covered in summer school:
* items 1-5 for everyone, plus item 7 for standard 5 and item 10 for standard 6
* NOTE(review): rowtotal() treats a missing item as 0, but the "+ quiz_7" / "+ quiz_10"
* additions turn the whole sum missing when that single item is missing - confirm intended
egen quiz_total_summer = rowtotal(quiz_1 quiz_2 quiz_3 quiz_4 quiz_5), missing
replace quiz_total_summer = quiz_total_summer + quiz_7 if std == 5
replace quiz_total_summer = quiz_total_summer + quiz_10 if std == 6
label variable quiz_total_summer "Quiz Total Score, taught in summer school"

generate quiz_total_nosummer = quiz_total - quiz_total_summer
label variable quiz_total_nosummer "Quiz Total Score, not taught in summer school"

* z-scores by grade for the two sub-totals.
* The label is derived from the raw score's label; splicing the loop token into the text
* produced "Quiz Total Score,summer in summer school, ..." and "...nosummer in summer school, ...".
foreach i in summer nosummer {
	generate quiz_total_`i'z = .
	foreach g in 5 6 {
		summarize quiz_total_`i' if std == `g'
		replace quiz_total_`i'z = (quiz_total_`i' - r(mean))/r(sd) if std == `g'
	}
	bysort std: summarize quiz_total_`i'z
	local base_label : variable label quiz_total_`i'
	label variable quiz_total_`i'z "`base_label', standardized by grade"
}

//"Like to study..." total score
* each composite below is the sum of its survey items (missing only when all of
* them are missing), followed by its z-score over the whole sample
egen like_study_total = rowtotal(survey_1 survey_2 survey_3 survey_4 survey_5 survey_6), missing
egen like_study_total_z = std(like_study_total)
label variable like_study_total "Like to study, total score"
label variable like_study_total_z "Like to study, total score, standardized"

//"Study hard..." total score
egen study_hard_total = rowtotal(survey_7 survey_8 survey_9 survey_10 survey_11 survey_12), missing
egen study_hard_total_z = std(study_hard_total)
label variable study_hard_total "Study hard, total score"
label variable study_hard_total_z "Study hard, total score, standardized"

//studying with friends total score
egen study_frnd_total = rowtotal(survey_13 survey_14), missing
egen study_frnd_total_z = std(study_frnd_total)
label variable study_frnd_total "studying with friends total score"
label variable study_frnd_total_z "studying with friends total score, standardized"

//study expectations/ attitude
* labelled like the scores above; these two pairs previously had no variable labels
egen study_attd_total = rowtotal(survey_15 survey_16), missing
egen study_attd_total_z = std(study_attd_total)
label variable study_attd_total "Study expectations/attitude, total score"
label variable study_attd_total_z "Study expectations/attitude, total score, standardized"

//parents help study
egen study_parents_total = rowtotal(survey_17 survey_18), missing
egen study_parents_total_z = std(study_parents_total)
label variable study_parents_total "Parents help study, total score"
label variable study_parents_total_z "Parents help study, total score, standardized"


//Dummy variables for all untrue/true 1-5 scale responses (1=mostly or totally true 0=somewhat/mostly/totally untrue)
* any value outside 1-5, including missing, leaves the dummy missing
forvalues q = 1/18 {
	generate survey_`q'_dum = inlist(survey_`q', 4, 5) if inlist(survey_`q', 1, 2, 3, 4, 5)
	label variable survey_`q'_dum "responded mostly/totally true for Q`q'"
	tabulate survey_`q'_dum, missing
}

* Dropping rollcall variables
drop code_rc school_name std_rc class gender firstname_rc surname_rc othername_rc villagename dd mm yyyy AGEyear lang3 summeroffer3

* Renaming variables: prefix everything with f2_, then restore the plain id
rename * f2_*
rename f2_id id

save "$cleaned_dir/2018_quiz_cleaned.dta", replace

********************************************************************************
*			11. 2017-2018 2nd Term Exam Scores
********************************************************************************
* 11. open the 2018 2nd Term score data; prefix is "t2e_"
use "$rawdata_dir/2017-2018 2term exam score/2018 term2 progress_exam_merged_final.dta", clear

* reduce variable storage types where this loses no information
compress *

* Labeling variable names
* NOTE(review): Stata variable labels are limited to 80 characters; some of the
* question texts used as labels in this section are longer - confirm how they are stored.

* --- rollcall fields ---
lab var code_rc "school code from rollcall"
lab var school_name "school name from rollcall"
lab var std_rc "standard from rollcall"
lab var class "class of learner"
lab var gender "gender of learner"
lab var firstname "firstname from rollcall"
lab var surname "surname from rollcall"
lab var name_rc "firstname+surname from rollcall"
lab var othername "othername (other than firstname and surname)"
lab var villagename "the name of living village"
lab var dd "date of birth"
lab var mm "month of birth"
lab var yyyy "year of birth"
lab var AGEyear "age in year"
lab var lang3 "language class"
lab var summeroffer3 "summer school participance"
lab var old_std "standard of 2016-2017 year "
* --- progress-note fields: a mark and a grade per subject, then totals ---
lab var code_progress "school code from progress note"
lab var std_progress "standard from progress note"
lab var mark_math "mark of mathematics from progress note"
lab var grd_math "grade of mathematics from progress note"
lab var mark_social "mark of social studies from progress note"
lab var grd_social "grade of social studies from progress note"
lab var mark_bible "mark of bible knowledges from progress note"
lab var grd_bible "grade of bible knowledges from progress note"
lab var mark_eng "mark of english  from progress note"
lab var grd_eng "grade of english from progress note"
lab var mark_chi "mark of chichewa from progress note"
lab var grd_chi "grade of chichewa from progress note"
lab var mark_sci "mark of primary science from progress note"
lab var grd_sci "grade of primary science from progress note"
lab var mark_art "mark of arts and life skills  from progress note"
lab var grd_art "grade of arts and life skills from progress note"
lab var mark_total "total mark from progress note(written by teachers)"
lab var grd_total "total grade from progress note"
lab var passorfail "pass or fail from progress note"
* --- standard 5 mathematics exam: entry fields, then one label per question ---
* each question variable is labelled with the question number and the question text
lab var math5_name "firstname+surname from std5 math exam entry"
lab var math5_code "school code from std5 math exam entry"
lab var std_exam "standard from exam paper"
lab var math5_total "mark of std5 math from exam entry"
lab var math5_q1 "1. Addition  K70000.00 + K12480.00 + 14200.75"
lab var math5_q2 "2. Subtraction  K87950.00 - K50400.00"
lab var math5_q3 "3. A football player received K7890 each week. How much did he receive in 6 weeks?"
lab var math5_q4 "4. Identify the figure below"
lab var math5_q5 "5. Which of the following shapes show line of symmetry?"
lab var math5_q6 "6. Change 8610 meters into kilometers and meters."
lab var math5_q7 "7. Change 1746 grams to kilograms and grams."
lab var math5_q8 "8. Change the following fractions to decimals; 45/100"
lab var math5_q9 "9. K85500 divided by 25"
lab var math5_q10 "10. A company bought 75 bags of groundnuts for K98925. What was the price of each bag?"
lab var math5_q11 "11. Find the equivalent fraction for 2/5"
lab var math5_q12 "12. Add the following common fractions 1/3 + 1/3"
lab var math5_q13 "13. Write down the fraction for the shaded part of the diagram."
lab var math5_q14 "14. Change 0.5 to fraction"
lab var math5_q15 "15. Multiplication 32705*27"
lab var math5_q16 "16. Add 3/10 and 1/5"
lab var math5_q17 "17. Write the numbers which have been modeled below (Abacus):"
lab var math5_q18 "18. Write five hundred and ten thousand two hundred and twenty three in figures."
lab var math5_q19 "19. Add 61072, 379610 and 160089 together."
lab var math5_q20 "20. Take away K30500 from K61600"
* questions 21-24 have lettered sub-parts, each stored in its own variable
lab var math5_q21a "21. a. Express 9/8 as a decimal number"
lab var math5_q21b "21. b. Draw the abacus for 0.23"
lab var math5_q21c "21. c. Simplify 2/3 + 1/9"
lab var math5_q22a "22. a. K9495 divided 9"
lab var math5_q22b "22. b. Work out the bill for the following budget. 10 packets of sugar at K55 00t each, 15 tablets of toilet soap at K27 05t each, 12 packets of salt at K15 40t each, 30 pens at K10 00t each, 8 exercise books at K17 00t each."
lab var math5_q23a "23. a. Complete the following equivalent fraction 2/3 + 8/_"
lab var math5_q23b "23. b. Simplify 8/12 to its lowest form"
lab var math5_q24a "24. a. Draw and shade a diagram to show 6/8"
lab var math5_q24b "24. b. Write 60100 in words"
* --- standard 5 social studies exam: entry fields, then one label per question ---
* each question variable is labelled with the question number and the question text
lab var social5_name "firstname+surname from std5 social studies exam entry"
lab var social5_code "school code from std5 social studies exam entry"
lab var social5_total "mark of std5 social studies from exam entry"
lab var social5_q1 "1. Which of the following is an example of landforms?"
lab var social5_q2 "2. In which region is Lilongwe River? "
lab var social5_q3 "3. The following are the oldest famiy group of the Nkhamanga kingdom EXCEPT"
lab var social5_q4 "4. Which of the following was the  leader of the Nkhamanga kingdom?"
lab var social5_q5 "5. Which of the following is one of the effects of population change?"
lab var social5_q6 "6. Which of the folllowing is one of the functions traditional structure?"
lab var social5_q7 "7. Which of the following is an example of a city assembly?"
lab var social5_q8 "8. Which of the following is not a type of soil?"
lab var social5_q9 "9. Mention the two contributions of the Nkhamanga kingdom to present day Malawi?"
lab var social5_q10 "10. Where did the Maravi come from?"
lab var social5_q11 "11. Which of the following Malawians showed bravery and courage and is alive?"
lab var social5_q12 "12. How did the Nkhamanga earn their living?"
lab var social5_q13 "13. The ability to use, control or direct something or someone is called _________"
lab var social5_q14 "14. Which of the following countries does not share boarders to Malawi? "
lab var social5_q15 "15. In which district is your school?"
* lettered variables (a, b, c) share one question text and differ only in the letter
lab var social5_q16a "16. a. Explain any two importances of Physical features"
lab var social5_q16b "16. b. Explain any two importances of Physical features"
lab var social5_q17a "17. a. Identify three uses of soil?"
lab var social5_q17b "17. b. Identify three uses of soil?"
lab var social5_q17c "17. c. Identify three uses of soil?"
lab var social5_q18a "18. a. Explain any two main reasons for the Maravi migration"
lab var social5_q18b "18. b. Explain any two main reasons for the Maravi migration"
lab var social5_q19a "19. a. Give any two examples of lakes in Malawi"
lab var social5_q19b "19. b. Give any two examples of lakes in Malawi"
lab var social5_q20 "20. What is population change"
lab var social5_q21 "21. Every district is headed by a ___________"
lab var social5_q22a "22. a. Mention any two factors that led to the growth of early kingdoms in Malawi."
lab var social5_q22b "22. b. Mention any two factors that led to the growth of early kingdoms in Malawi."
lab var social5_q23a "23. a. Name any three means of communication in your district."
lab var social5_q23b "23. b. Name any three means of communication in your district."
lab var social5_q23c "23. c. Name any three means of communication in your district."
* --- standard 6 mathematics exam: entry fields, then one label per question ---
* each question variable is labelled with the question number and the question text
lab var math6_name "firstname+surname from std6 math studies exam entry"
lab var math6_code "school code from std6 math studies exam entry"
lab var math6_total "mark of std6 math from exam entry"
lab var math6_q1 "1. Arrange the following numbers in ascending order:"
lab var math6_q2 "2. Find the product of 9149 and 343"
lab var math6_q3 "3. Find the HCF of 8 and 34 using continued division method"
lab var math6_q4 "4. The cost of producing 8000 tablets of soap is K4000. What is the rate of production of the tablets of soap?"
lab var math6_q5 "5. A builder is paid K3000 for working for 5 hours. What is his pay rate per hour?"
lab var math6_q6 "6. Simplify the following ratio to its simplest form 1/8:1/4"
lab var math6_q7 "7. A shopkeeper gave 2 free pencils for every 6 books bought. What was the ratio of pencils to books?"
lab var math6_q8 "8. Express the following ratio as a fraction 13:16"
lab var math6_q9 "9. Add the following 0.81 + 1.28"
lab var math6_q10 "10. 0.157 + 0.218 + 0.501 ="
lab var math6_q11 "11. How many times does 568 go into 5285240?"
lab var math6_q12 "12. Find the perimeter of the rectangle below: Length = 8cm Width = 5cm"
lab var math6_q13 "13. Simplify 3/4 x 2/9"
lab var math6_q14 "14. In a tray of 30 eggs, 20 eggs are good and the rest are bad. How many eggs are bad?"
lab var math6_q15 "15. Express the following number to 2 decimal places 70.132"
lab var math6_q16 "16.Change the following to decimal notation K127 49t"
lab var math6_q17 "17. What is the LCM of the following: 40, 48 and 64"
lab var math6_q18 "18. Divide the following number: 2370375 by 175"
lab var math6_q19 "19. Write seven million, four hundred and two thousand and four in figure."
lab var math6_q20 "20. The factors of 48 are:"
lab var math6_q21 "21. What is the cost of one bag of beans, if 248 bags of beans cost K59272?"
lab var math6_q22 "22. Chisomo covered 4KM 400M in 2 hours. Find Chisomo's rate in KM per hour."
* questions 23, 26 and 27 have lettered sub-parts, each stored in its own variable
lab var math6_q23a "23. a. Change the following mixed numbers to improper fraction. 3/3/4"
lab var math6_q23b "23. b. Change the following mixed numbers to improper fraction. 8/1/2"
lab var math6_q24 "24. Calculate below. 130.203 - 12.789"
lab var math6_q25 "25. Find the total cost of the following items. 4 bottles of soft drinks, 8 mathematics books, 3 knives and 7 rulers."
lab var math6_q26a "26. a. Indentify the following angles"
lab var math6_q26b "26. b. Indentify the following angles"
lab var math6_q27a "27. a. Find the cost of 12 wrist watches if one costs K102.50t each."
lab var math6_q27b "27. b. Calcuate below. 4017.20 - 2869.69"
lab var social6_name "firstname+surname from std6 social studies exam entry"
lab var social6_code "school code from std6 social studies exam entry"
lab var social6_total "mark of std6 social studies from exam entry"
lab var social6_q1 "1. Which of the following is one of thr effects of elements of weather?"
lab var social6_q2 "2. Where did the Ngoni come from?"
lab var social6_q3 "3. Which of the following is one of the major factors that influence the climate of Malawi?"
lab var social6_q4 "4. Making by-laws is one of functions of _________"
lab var social6_q5 "5. What name is given to the removal of top soil? "
lab var social6_q6 "6. Which of the following is a type of soil erosion?"
lab var social6_q7 "7. One of the fishing grounds in Malawi is ______"
lab var social6_q8 "8. The following are problems associated with fishing in Malawi."
lab var social6_q9 "9. Which of the following is a misconception about HIV/AIDS?"
lab var social6_q10 "10. In which months does Malawi experience cool and dry season?"
lab var social6_q11 "11. Which element of weather is used to measure wind speed?"
lab var social6_q12 "12. Which of the following is one of the causes of population increase in Malawi?"
lab var social6_q13 "13. The average weather condition of an area observed over a long period of time is called _______."
lab var social6_q14 "14. The two components of the environment are as follows:"
lab var social6_q15 "15. To which type of forests does M'sangu and M'thethe type of trees belong to?"
lab var social6_q16 "16. What is environment?"
lab var social6_q17a "17. a. Give two examples of social environment."
lab var social6_q17b "17. b. Give two examples of social environment."
lab var social6_q18a "18. a. Describe two importances of forests."
lab var social6_q18b "18. b. Describe two importances of forests."
lab var social6_q19a "19. a. Figure 1 is a picture showing destructive agents of forests. Use it to answer questions that follow. Identify the activity"
lab var social6_q19b "19. b. Figure 1 is a picture showing destructive agents of forests. Use it to answer questions that follow. How can the behaviour above promote deforestation?"
lab var social6_q20 "20. In which season do we expect rains in Malawi?"
lab var social6_q21a "21. a. Figure 2 is a diagram showing one of the instruments for measuring elements of weather. Use it to answer the questions that follow. Identify the instrument above"
lab var social6_q21b "21. b. Figure 2 is a diagram showing one of the instruments for measuring elements of weather. Use it to answer the questions that follow. Describe the use of the instrument above"
lab var social6_q22 "22. Name the powerful god among the mwenemutapa kingdom."

* Correcting the grade on the progress note (over 50% of the teacher's notes on
* pupil progress are missing): when the roll-call grade and the exam grade agree,
* that grade overwrites std_progress.
forvalues g = 5/6 {
	replace std_progress = `g' if std_rc == `g' & std_exam == `g'
}

* Correcting pass/fail: a recorded value of 2 is treated as an error and set to missing
replace passorfail = . if passorfail == 2

* Combined social studies + bible knowledge mark (missing if either component is missing)
generate mark_ssbk = mark_social + mark_bible

* Z-scores for the progress-note marks, standardized separately within each grade
* (std_progress 5 and 6). Pupils with any other std_progress value keep a missing z-score.
foreach mk of varlist mark_math mark_social mark_bible mark_ssbk mark_eng mark_chi mark_sci mark_art mark_total {
	generate `mk'z = .
	forvalues g = 5/6 {
		* r(mean) and r(sd) come from the summarize immediately above
		summarize `mk' if std_progress == `g'
		replace `mk'z = (`mk' - r(mean)) / r(sd) if std_progress == `g'
	}
	* display the result by grade as a check (this also sorts the data by std_progress)
	bysort std_progress: summarize `mk'z
	label variable `mk'z "`mk' from progress note, standardized by grade"
}

* Z-scores for the exam-sheet totals: each total is standardized over all of its
* own non-missing observations.
foreach tot of varlist math5_total social5_total math6_total social6_total {
	generate `tot'z = .
	summarize `tot'
	replace `tot'z = (`tot' - r(mean)) / r(sd)
	summarize `tot'z
	label variable `tot'z "`tot' from exam sheet, standardized"
}

* One z-score per subject: take the std 5 or std 6 version according to std_exam
foreach subj in math social {
	generate `subj'_totalz = .
	forvalues g = 5/6 {
		replace `subj'_totalz = `subj'`g'_totalz if std_exam == `g'
	}
	label variable `subj'_totalz "`subj' from exam sheet, standardized by grade"
}
	
* Scoring the multiple-choice questions: <question>_dum is 2 when the recorded answer
* equals the option tested in its loop, 0 otherwise, and missing when the answer is
* system-missing (.). The questions are grouped by the option (1-4) they are compared with.
* NOTE(review): math6_q4 is in none of the four lists, so no math6_q4_dum is created — confirm this is intentional.

* questions compared with option 1
foreach q of varlist math5_q2 math5_q3 math5_q10 math5_q12 math5_q16 math5_q18 social5_q2 social5_q5 social5_q7 social5_q12 math6_q2 math6_q7 math6_q12 math6_q13 math6_q15 math6_q17 social6_q3 social6_q6 social6_q14 {
	generate `q'_dum = 2 * (`q' == 1) if `q' != .
}

* questions compared with option 2
foreach q of varlist math5_q1 math5_q6 math5_q7 math5_q8 math5_q9 math5_q13 math5_q15 social5_q1 social5_q11 social5_q14 math6_q1 math6_q6 math6_q8 math6_q16 math6_q18 social6_q1 social6_q9 social6_q12 {
	generate `q'_dum = 2 * (`q' == 2) if `q' != .
}

* questions compared with option 3
foreach q of varlist math5_q4 math5_q17 social5_q6 social5_q8 social5_q9 social5_q10 social5_q15 math6_q3 math6_q5 math6_q10 math6_q11 math6_q19 social6_q4 social6_q5 social6_q8 social6_q13 social6_q15 {
	generate `q'_dum = 2 * (`q' == 3) if `q' != .
}

* questions compared with option 4
foreach q of varlist math5_q5 math5_q11 math5_q19 math5_q20 social5_q3 social5_q4 social5_q13 math6_q9 math6_q14 math6_q20 social6_q2 social6_q7 social6_q10 social6_q11 {
	generate `q'_dum = 2 * (`q' == 4) if `q' != .
}

* math5_q14 has no correct answer, so every non-missing response gets full marks
generate math5_q14_dum = 2 if math5_q14 != .

* Open-answer questions: a mark of 99 is recoded to missing.
* Variables are grouped by exam paper; each is listed once.
foreach q of varlist ///
	math5_q21a math5_q21b math5_q21c math5_q22a math5_q22b math5_q23a math5_q23b math5_q24a math5_q24b ///
	social5_q16a social5_q16b social5_q17a social5_q17b social5_q17c social5_q18a social5_q18b social5_q19a social5_q19b social5_q20 social5_q21 social5_q22a social5_q22b social5_q23a social5_q23b social5_q23c ///
	math6_q21 math6_q22 math6_q23a math6_q23b math6_q24 math6_q25 math6_q26a math6_q26b math6_q27a math6_q27b ///
	social6_q16 social6_q17a social6_q17b social6_q18a social6_q18b social6_q19a social6_q19b social6_q20 social6_q21a social6_q21b social6_q22 {
	replace `q' = . if `q' == 99
}

* Recomputed exam totals: sum of the multiple-choice dummies (*_dum) and the
* open-answer marks for each paper. With the missing option, the total is missing
* only when every component is missing.
egen math5_total_calc = rowtotal(math5_*_dum ///
	math5_q21a math5_q21b math5_q21c math5_q22a math5_q22b math5_q23a math5_q23b math5_q24a math5_q24b), missing

egen social5_total_calc = rowtotal(social5_*_dum ///
	social5_q16a social5_q16b social5_q17a social5_q17b social5_q17c social5_q18a social5_q18b social5_q19a social5_q19b ///
	social5_q20 social5_q21 social5_q22a social5_q22b social5_q23a social5_q23b social5_q23c), missing

egen math6_total_calc = rowtotal(math6_*_dum ///
	math6_q21 math6_q22 math6_q23a math6_q23b math6_q24 math6_q25 math6_q26a math6_q26b math6_q27a math6_q27b), missing

egen social6_total_calc = rowtotal(social6_*_dum ///
	social6_q16 social6_q17a social6_q17b social6_q18a social6_q18b social6_q19a social6_q19b ///
	social6_q20 social6_q21a social6_q21b social6_q22), missing
	
	* z-scores of the recomputed totals, each standardized over all of its own
	* non-missing observations
	foreach exam in math5 social5 math6 social6 {
		generate `exam'_total_calcz = .
		summarize `exam'_total_calc
		replace `exam'_total_calcz = (`exam'_total_calc - r(mean)) / r(sd)
		summarize `exam'_total_calcz
		label variable `exam'_total_calcz "`exam' exam total score, standardized"
	}

	* one z-score per subject: std 5 or std 6 version according to std_exam
	foreach subj in math social {
		generate `subj'_total_calcz = .
		forvalues g = 5/6 {
			replace `subj'_total_calcz = `subj'`g'_total_calcz if std_exam == `g'
		}
		label variable `subj'_total_calcz "`subj' exam total score, standardized by grade"
	}
		
	
* Totals split by whether the question was taught in summer school.
* *_total_summer sums the questions listed below (missing only if all are missing);
* *_total_nosummer is the remainder of the recomputed total.
egen math5_total_summer = rowtotal( ///
	math5_q9_dum math5_q11_dum math5_q12_dum math5_q13_dum math5_q15_dum math5_q16_dum math5_q17_dum math5_q18_dum math5_q19_dum ///
	math5_q21b math5_q21c math5_q24a math5_q24b), missing

egen social5_total_summer = rowtotal( ///
	social5_q1_dum social5_q2_dum social5_q3_dum social5_q4_dum social5_q5_dum social5_q8_dum social5_q9_dum social5_q10_dum social5_q11_dum social5_q12_dum social5_q13_dum social5_q15_dum ///
	social5_q16a social5_q16b social5_q17a social5_q17b social5_q17c social5_q18a social5_q18b social5_q19a social5_q19b social5_q22a social5_q22b social5_q23a social5_q23b social5_q23c), missing

egen math6_total_summer = rowtotal( ///
	math6_q1_dum math6_q2_dum math6_q3_dum math6_q11_dum math6_q17_dum math6_q18_dum math6_q19_dum math6_q20_dum), missing

egen social6_total_summer = rowtotal( ///
	social6_q2_dum social6_q3_dum social6_q9_dum social6_q12_dum ///
	social6_q17a social6_q17b social6_q18a social6_q18b social6_q22), missing

* non-summer part = recomputed total minus summer part (missing if either is missing)
foreach exam in math5 social5 math6 social6 {
	generate `exam'_total_nosummer = `exam'_total_calc - `exam'_total_summer
}
	
	* z-scores of the summer / non-summer totals, standardized within exam grade:
	* the std 5 total is used where std_exam==5 and the std 6 total where std_exam==6
	foreach subj in math social {
		foreach part in summer nosummer {
			generate `subj'_total_`part'z = .
			forvalues g = 5/6 {
				summarize `subj'`g'_total_`part' if std_exam == `g'
				replace `subj'_total_`part'z = (`subj'`g'_total_`part' - r(mean)) / r(sd) if std_exam == `g'
			}
			label variable `subj'_total_`part'z "`subj' exam total score from exam sheet, `part', std'zed by grade"
		}
	}
	
	
* Dropping duplicates
* List every id that occurs more than once, then show the key fields of the rows
* for id 1721355 in the log. `list` replaces the interactive `browse`, which opens
* the Data Browser window and so does not belong in an unattended cleaning run.
duplicates list id
list id math6_total_calc math6_q26a if id==1721355

* Hard-coded resolution for id 1721355: drop the row whose recomputed math6 total
* is 46 and the row with no mark for math6_q26a.
* NOTE(review): presumably exactly one row for this id remains afterwards — confirm.
drop if id==1721355 & math6_total_calc==46
drop if id==1721355 & math6_q26a==.

* Report any duplicates that remain so they are visible in the log
duplicates report id
	
* Remove the roll-call variables from the exam dataset
drop code_rc school_name std_rc class gender firstname surname name_rc ///
	othername villagename dd mm yyyy AGEyear lang3 summeroffer3 old_std

* Prefix every remaining variable with t2e_, then give id back its unprefixed name
rename * t2e_*
rename t2e_id id

* Write the cleaned exam data (overwrites any existing file)
save "$cleaned_dir/2018_exam_cleaned.dta", replace
